#   LibAut
#       onset regressions, multiple, with missing data imputation
#       Table C4
#   Juraj Medzihorsky
#   2021-08-14

library(Amelia)
library(logistf)
library(arm)
library(parallel)
options(mc.cores=14) # written for a 16-thread linux machine
options(stringsAsFactors=F)
source('helpers.R')

#   sessionInfo()
##  R version 4.0.4 (2021-02-15)
##  Platform: x86_64-pc-linux-gnu (64-bit)
##  Running under: Ubuntu 20.04.2 LTS
##
##  Matrix products: default
##  BLAS/LAPACK: /opt/OpenBLAS/lib/libopenblas_haswellp-r0.3.13.so
##
##  locale:
##   [1] LC_CTYPE=C.UTF-8           LC_NUMERIC=C
##   [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C.UTF-8
##   [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C.UTF-8
##   [7] LC_PAPER=C.UTF-8           LC_NAME=C
##   [9] LC_ADDRESS=C               LC_TELEPHONE=C
##  [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
##  attached base packages:
##  [1] parallel  stats     graphics  grDevices utils     datasets  methods
##  [8] base
##
##  other attached packages:
##  [1] arm_1.11-2     lme4_1.1-26    Matrix_1.3-2   MASS_7.3-53.1
##  [5] logistf_1.24   Amelia_1.8.0   Rcpp_1.0.7     nvimcom_0.9-92
##
##  loaded via a namespace (and not attached):
##   [1] statmod_1.4.35       xfun_0.21            tidyselect_1.1.0
##   [4] purrr_0.3.4          splines_4.0.4        operator.tools_1.6.3
##   [7] lattice_0.20-41      colorspace_2.0-0     vctrs_0.3.6
##  [10] generics_0.1.0       htmltools_0.5.1.1    mgcv_1.8-34
##  [13] base64enc_0.1-3      survival_3.2-10      rlang_0.4.10
##  [16] pillar_1.4.7         nloptr_1.2.2.2       foreign_0.8-81
##  [19] glue_1.4.2           DBI_1.1.1            RColorBrewer_1.1-2
##  [22] jpeg_0.1-9           lifecycle_1.0.0      stringr_1.4.0
##  [25] munsell_0.5.0        gtable_0.3.0         htmlwidgets_1.5.3
##  [28] coda_0.19-4          knitr_1.31           latticeExtra_0.6-29
##  [31] htmlTable_2.2.1      broom_0.7.5          checkmate_2.0.0
##  [34] backports_1.2.1      scales_1.1.1         Hmisc_4.5-0
##  [37] abind_1.4-5          gridExtra_2.3        digest_0.6.27
##  [40] ggplot2_3.3.3        png_0.1-7            stringi_1.5.3
##  [43] formula.tools_1.7.1  dplyr_1.0.4          grid_4.0.4
##  [46] tools_4.0.4          magrittr_2.0.1       tibble_3.0.6
##  [49] Formula_1.2-4        mice_3.13.0          cluster_2.1.1
##  [52] crayon_1.4.1         tidyr_1.1.2          pkgconfig_2.0.3
##  [55] ellipsis_0.3.1       data.table_1.14.0    rstudioapi_0.13
##  [58] assertthat_0.2.1     minqa_1.2.4          R6_2.5.0
##  [61] boot_1.3-27          rpart_4.1-15         nnet_7.3-15
##  [64] nlme_3.1-152         compiler_4.0.4

#------------------------------------------------------------------------------

code_dir <- getwd()
data_dir <- gsub('code$', 'data', code_dir)
figs_dir <- gsub('code$', 'figures', code_dir)
tabs_dir <- gsub('code$', 'tables', code_dir)

setwd(data_dir)
dat <- readRDS('onset_20210810.rds')
eps <- read.csv('eps.csv')
setwd(code_dir)

dat <- subset(dat, onset_eligible%in%1)

#------------------------------------------------------------------------------
#   get EP shares
tab_dem <- aggregate(eps$dem_ep, by=list(eps$year), FUN=mean, na.rm=T)
tab_aut <- aggregate(eps$aut_ep, by=list(eps$year), FUN=mean, na.rm=T)
names(tab_dem) <- c('year', 'share_dem_ep')
names(tab_aut) <- c('year', 'share_aut_ep')

tab <- merge(tab_dem, tab_aut, by='year')

tab_lag <- tab
tab_lag$year <- tab$year + 1
names(tab_lag)[2:3] <- paste0('lag1_', names(tab)[2:3])

tab_tab <- merge(tab, tab_lag)

dat <- merge(dat, tab_tab, by='year')

#------------------------------------------------------------------------------
#   get outcome

#   recode outcomes
#   epy: suc 1 fail 0
dat$epy <- as.numeric(rep(NA,nrow(dat)))
#   success: 1, 5
dat$epy[dat$dem_ep_outcome%in%c(1,5)] <- 1
#   fail: 2,3,4
dat$epy[dat$dem_ep_outcome%in%c(2,3,4)] <- 0
#   censored: 6
dat$epy[dat$dem_ep_outcome%in%6] <- NA

#   with(dat, table(epy, dem_ep_outcome, useNA='a'))
#   sum(!is.na(dat$epy))

#------------------------------------------------------------------------------

dat$logpop <- log(dat$e_mipopula) 

dat$reg_fac <- as.factor(dat$e_regionpol_6C)

#   with(dat, table(reg_fac, e_regionpol_6C))

reglabs <- c('Eastern Europe & Central Asia',
             'Latin America &the Caribbean',
             'MENA, ISR, TUR',
             'Sub-Saharan Africa',
             'Western Europe, North America, CYP, AUS, NZ',
             'Asia & Pacific')

#------------------------------------------------------------------------------

dat_old <- dat

#   filter out columns
vars_to_go <- 
    c('epy',
      'libaut_start', 
      'year',
      'reg_fac', 
      'lag1_e_migdppcln',  
      'lag1_e_migdpgro',  
      'lag1_logpop', 
      'lag1_v2pepwrsoc',  
      'lag1_v2xeg_eqdr',  
      'lag1_v2xnp_pres',  
      'lag1_v2csprtcpt', 
      'lag1_v2x_polyarchy', 
      'lag1_excl_region_edi',  
      'lag1_e_miinteco',  
      'lag1_e_miinterc', 
      'lag1_share_dem_ep', 
      'lag1_share_aut_ep')  

dat <- dat[, c(vars_to_go, 'country_text_id', 'dem_ep_id')]

#------------------------------------------------------------------------------

bins <- c('lag1_e_miinteco', 'lag1_e_miinterc')#, 'libaut_start') 
cats <- c('country_text_id', 'dem_ep_id', 'reg_fac')


set.seed(123)
system.time(
    a <- amelia(dat, m=100, polytime=3, 
                idvars=c('dem_ep_id', 'libaut_start', 'epy'), 
                ts='year', cs='country_text_id', noms=c(bins, 'reg_fac'))
)


#------------------------------------------------------------------------------
#   fit

#   formulas
fy0 <- libaut_start ~ 1
#
lin_1 <- ~ . + 
    reg_fac +
    lag1_e_migdppcln + 
    lag1_e_migdpgro + 
    lag1_logpop +
    lag1_v2pepwrsoc + 
    lag1_v2xeg_eqdr + 
    lag1_v2xnp_pres + 
    lag1_v2csprtcpt +
    lag1_v2x_polyarchy +
    lag1_excl_region_edi + 
    lag1_e_miinteco + 
    lag1_e_miinterc +
    lag1_share_dem_ep +
    lag1_share_aut_ep  
#
lin_2 <- ~ . + 
    I(1e-1*(year-1990)) + I((1e-1*(year-1990))^2) + I((1e-1*(year-1990))^3) 
#
fl1 <- update(fy0, lin_1)
fl2 <- update(fl1, lin_2)

o1 <- mclapply(a$imputations, function(j) glm(fl1, data=j, family=stats::gaussian('identity')))
o2 <- mclapply(a$imputations, function(j) glm(fl2, data=j, family=stats::gaussian('identity')))

l1 <- mclapply(a$imputations, function(j) glm(fl1, data=j, family=stats::binomial('logit')))
l2 <- mclapply(a$imputations, function(j) glm(fl2, data=j, family=stats::binomial('logit')))

mcont <- logistf.control(maxit=1e4, maxstep=1e4)
list_f1 <- mclapply(a$imputations, function(i) logistf(fl1, data=i, control=mcont))
list_f2 <- mclapply(a$imputations, function(i) logistf(fl2, data=i, control=mcont))


bo1 <- do.call(rbind, lapply(o1, coef))
bo2 <- do.call(rbind, lapply(o2, coef))
bl1 <- do.call(rbind, lapply(l1, coef))
bl2 <- do.call(rbind, lapply(l2, coef))
bf1 <- do.call(rbind, lapply(list_f1, function(i) i$coefficients))
bf2 <- do.call(rbind, lapply(list_f2, function(i) i$coefficients))

eo1 <- do.call(rbind, lapply(o1, se.coef))
eo2 <- do.call(rbind, lapply(o2, se.coef))
el1 <- do.call(rbind, lapply(l1, se.coef))
el2 <- do.call(rbind, lapply(l2, se.coef))
ef1 <- do.call(rbind, lapply(list_f1, function(i) diag(i$var)^0.5))
ef2 <- do.call(rbind, lapply(list_f2, function(i) diag(i$var)^0.5))


cmondude <-
    function(x)
    {
        z <- do.call(cbind, lapply(x, as.vector))
        rownames(z) <- colnames(x[[1]])
        return(z)
    }

do1 <- cmondude(mi.meld(bo1, eo1))
do2 <- cmondude(mi.meld(bo2, eo2))
dl1 <- cmondude(mi.meld(bl1, el1))
dl2 <- cmondude(mi.meld(bl2, el2))
df1 <- cmondude(mi.meld(bf1, ef1))
df2 <- cmondude(mi.meld(bf2, ef2))


getci <-
    function(x, dig=2)
    {
        ci <- data.frame(term=rownames(x),
                         estimate=round(x[,1], dig),
                         lo=round(x[,1]-1.96*x[,2], dig),
                         hi=round(x[,1]+1.96*x[,2], dig))
        rownames(ci) <- rownames(x)
        return(ci)
    }


ci_o1 <- getci(do1)
ci_o2 <- getci(do2)
ci_l1 <- getci(dl1)
ci_l2 <- getci(dl2)
ci_f1 <- getci(df1)
ci_f2 <- getci(df2)


bb <- 
    data.frame(o1=c(ci_o1$estimate, rep(NA,3)),
               o2=c(ci_o2$estimate),
               l1=c(ci_l1$estimate, rep(NA,3)),
               l2=c(ci_l2$estimate),
               f1=c(ci_f1$estimate, rep(NA,3)),
               f2=c(ci_f2$estimate))

bt <- paste(apply(bb, 1, paste, collapse=' & '), '\\\\')

ci <-
    data.frame(
        o1=c(paste0('(', ci_o1$lo, ', ', ci_o1$hi, ')'), rep('',3)),
        o2=  paste0('(', ci_o2$lo, ', ', ci_o2$hi, ')'),
        l1=c(paste0('(', ci_l1$lo, ', ', ci_l1$hi, ')'), rep('',3)),
        l2=  paste0('(', ci_l2$lo, ', ', ci_l2$hi, ')'),
        f1=c(paste0('(', ci_f1$lo, ', ', ci_f1$hi, ')'), rep('',3)),
        f2=  paste0('(', ci_f2$lo, ', ', ci_f2$hi, ')'))

ct <- paste(apply(ci, 1, paste, collapse=' & '), '\\\\')

bt <- gsub('-', '$-$', bt)
ct <- gsub('-', '$-$', ct)
bt <- gsub('NA', '', bt)
ct <- gsub('NA', '', ct)

tab <- character(length=4*length(bt))
for (i in 1:length(bt)) {
    ii <- (i-1)*3 + i
    tab[ii] <- as.character(ci_l2$term)[i]
    tab[ii+1] <- paste0(' & ', bt[i])
    tab[ii+2] <- paste0(' & ', ct[i])
}

setwd(tabs_dir)
writeLines(tab, con='Table_C4.tex')

#   SCRIPT END